

db.handle <- odbcDriverConnect('server=vhacdwa01.vha.med.va.gov; database=CDWWork; driver={SQL server}; trusted_connection=true')

# FIRST appointment request associated with 1010-EZ without prior care
pcp_claims <- sqlQuery(db.handle, "SELECT * FROM OMHO_QFR.ECON.appointments")

pcp_claims <- data.table(pcp_claims)
pcp_claims$VisitDate <- as.Date(pcp_claims$VisitDateTime, "%Y-%m-%d")
pcp_claims$AppointmentRequestDate <- as.Date(pcp_claims$AppointmentRequestDate, "%Y-%m-%d")
pcp_claims$DesiredAppointmentDate <- as.Date(pcp_claims$DesiredAppointmentDate, "%Y-%m-%d")
pcp_claims$AppointmentMadeDate <- as.Date(pcp_claims$AppointmentMadeDate, "%Y-%m-%d")
pcp_claims$AppointmentDate <- as.Date(pcp_claims$AppointmentDate, "%Y-%m-%d")
pcp_claims$DesiredAppointmentDate[which(pcp_claims$DesiredAppointmentDate>as.Date("2018-03-03", "%Y-%m-%d"))] <- NA
pcp_claims$DesiredDayOfWeek <- weekdays(pcp_claims$DesiredAppointmentDate)
pcp_claims$WaitTime <- as.numeric(pcp_claims$AppointmentDate-pcp_claims$DesiredAppointmentDate)
pcp_claims$TimeOut <- as.numeric(pcp_claims$DesiredAppointmentDate-pcp_claims$AppointmentMadeDate)
pcp_claims$TotalTime <- as.numeric(pcp_claims$AppointmentDate-pcp_claims$AppointmentRequestDate)
pcp_claims$ContactTime <- as.numeric(pcp_claims$AppointmentMadeDate-pcp_claims$AppointmentRequestDate)

# Drop females:
pcp_claims <- pcp_claims[Gender=="M"]

# Drop top 1% of wait and total time (corresponds to roughly 50 and 100 respectively)
pcp_claims <- pcp_claims[WaitTime>=0 & WaitTime<=50 & TotalTime>=0 & TotalTime<=100]

# There are a few hundred patients that are in an inpatient setting during their assigned PCP. Most are going to see their PCP for mental illness
pcp_claims$PV_DateDiff[which(is.na(pcp_claims$PV_DateDiff))] <- Inf
pcp_claims <- pcp_claims[!(PV_OutInp=="inpat" & PV_DateDiff<0)]
pcp_claims$PriorVisit <- (pcp_claims$PV_DateDiff<=365)

# Get FIRST EVER Enrollment
first_enrollment <- sqlQuery(db.handle, "
SELECT DISTINCT 
  cohort.PatientICN, EnrollmentDate
FROM CDWWork.Patient.Enrollment AS a
INNER JOIN CDWWork.Patient.Patient AS pat
  ON a.PatientSID=pat.PatientSID
INNER JOIN OMHO_QFR.ECON.appointments AS cohort
  ON pat.PatientICN=cohort.PatientICN
WHERE EnrollmentDate<=CAST('2017-04-01 00:00:00' AS datetime2(0))")

first_enrollment$EnrollmentDate <- as.Date(first_enrollment$EnrollmentDate, "%Y-%m-%d")
first_enrollment <- first_enrollment[order(first_enrollment$PatientICN, first_enrollment$EnrollmentDate),]# take first first_enrollment date
first_enrollment <- first_enrollment[!duplicated(first_enrollment$PatientICN),]

pcp_claims <- merge(x=pcp_claims, y=first_enrollment, by.x="PatientICN", by.y="PatientICN", all.x=T)
pcp_claims$BenefitLength <- as.numeric(pcp_claims$AppointmentRequestDate-pcp_claims$EnrollmentDate) 
rm(first_enrollment)


# Get most RECENT Enrollment
recent_enrollment <- sqlQuery(db.handle, "
SELECT 
  PatientICN, EnrollmentPriority, AnnualVACheckAmount, MedicaidFlag, ExposedToAgentOrangeFlag, RadiationExposureIndicatedFlag, UnemployableFlag
FROM(
  SELECT
    cohort.PatientICN, EffectiveDate, EnrollmentPriority, AnnualVACheckAmount, MedicaidFlag, ExposedToAgentOrangeFlag, RadiationExposureIndicatedFlag, UnemployableFlag,
    ROW_NUMBER() OVER(PARTITION BY cohort.PatientICN ORDER BY EffectiveDate DESC) AS row_num
  FROM CDWWork.Patient.Enrollment AS a
  INNER JOIN CDWWork.Patient.Patient AS pat
    ON a.PatientSID=pat.PatientSID
  INNER JOIN OMHO_QFR.ECON.appointments AS cohort
    ON pat.PatientICN=cohort.PatientICN
  WHERE EffectiveDate<=CAST(cohort.VisitDateTime AS date)
) AS z 
WHERE row_num=1")


# It appears most missing ServiceConnectedPercentage are in PriorityGroups 3-8 so zero
recent_enrollment$AnnualVACheckAmount[which(is.na(recent_enrollment$AnnualVACheckAmount))] <- 0
recent_enrollment$ExposedToAgentOrangeFlag <- (recent_enrollment$ExposedToAgentOrangeFlag=="Y")
recent_enrollment$ExposedToAgentOrangeFlag[which(is.na(recent_enrollment$ExposedToAgentOrangeFlag))] <- 0
recent_enrollment$RadiationExposureIndicatedFlag <- (recent_enrollment$RadiationExposureIndicatedFlag=="Y")
recent_enrollment$RadiationExposureIndicatedFlag[which(is.na(recent_enrollment$RadiationExposureIndicatedFlag))] <- 0
recent_enrollment$UnemployableFlag <- (recent_enrollment$UnemployableFlag=="Y")
recent_enrollment$UnemployableFlag[which(is.na(recent_enrollment$UnemployableFlag))] <- 0
pcp_claims$CombatFlag <- (pcp_claims$Combat1010EZFlag=="Y")
pcp_claims$CombatFlag[which(is.na(pcp_claims$CombatFlag))] <- 0

pcp_claims <- merge(x=pcp_claims, y=recent_enrollment[,c("PatientICN", "EnrollmentPriority", "AnnualVACheckAmount", "ExposedToAgentOrangeFlag", "RadiationExposureIndicatedFlag", "UnemployableFlag")], by.x="PatientICN", by.y="PatientICN", all.x=T)
rm(recent_enrollment)


### Diagnosis:
pcp_claims$ICD9Code[which(pcp_claims$ICD9Code=="*Unknown at this time*")] <- NA
pcp_claims$ICD9Code <- gsub(".", "", pcp_claims$ICD9Code, fixed=TRUE)
pcp_claims$ICD10Code[which(pcp_claims$ICD10Code=="*Unknown at this time*")] <- NA
pcp_claims$ICD10Code <- as.character(pcp_claims$ICD10Code)
pcp_claims$ICD10Code <- gsub(".", "", pcp_claims$ICD10Code, fixed=TRUE)
pcp_claims$DRG <- substring(pcp_claims$ICD9Code, 1, 3)

icd_cw <- read_csv("O:/OMHSP_PERCAnalyt/Upload/icd10cmtoicd9gem.csv")
icd_cw <- icd_cw[!duplicated(icd_cw[,c("icd10cm")]),] # just taking a random crosswalk if there are multiple ICD9 to an ICD10
pcp_claims <- merge(x=pcp_claims, y=icd_cw[c("icd9cm", "icd10cm")], by.x="ICD10Code", by.y="icd10cm", all.x=T)
pcp_claims$ICD9Code[which(!is.na(pcp_claims$ICD10Code))] <- pcp_claims$icd9cm[which(!is.na(pcp_claims$ICD10Code))]
rm(icd_cw)

pcp_claims$StaffSSN <- as.factor(pcp_claims$StaffSSN)
pcp_claims$RelationshipStartDate <- as.Date(pcp_claims$RelationshipStartDate, "%Y-%m-%d")
pcp_claims$RelationshipEndDate <- as.Date(pcp_claims$RelationshipEndDate, "%Y-%m-%d")
pcp_claims$relationshipLength <- as.numeric(pcp_claims$RelationshipEndDate-pcp_claims$RelationshipStartDate)
pcp_claims <- data.table(pcp_claims)

pcp_claims$DayOfWeek <- weekdays(pcp_claims$VisitDate)
pcp_claims$Month <- format(pcp_claims$VisitDate, "%m")
pcp_claims$VisitYear <- format(pcp_claims$VisitDate, "%Y")
pcp_claims$YearXMonth <- interaction(pcp_claims$VisitYear, pcp_claims$Month)

pcp_claims$AssignYear <- format(pcp_claims$RelationshipStartDate, "%Y")
pcp_claims$BirthDateTime <- as.Date(pcp_claims$BirthDateTime, "%Y-%m-%d")
pcp_claims <- pcp_claims[RelationshipStartDate>BirthDateTime]

pcp_claims <- pcp_claims[, Age := as.integer(substring(as.period(interval(BirthDateTime, VisitDate), unit="years"), 0, 2))]
pcp_claims <- pcp_claims[Age>=20 & Age<90]
pcp_claims$Age_bins <- "0"
pcp_claims$Age_bins[which(pcp_claims$Age>=25& pcp_claims$Age<30)] <- "1"
pcp_claims$Age_bins[which(pcp_claims$Age>=30& pcp_claims$Age<35)] <- "2"
pcp_claims$Age_bins[which(pcp_claims$Age>=35& pcp_claims$Age<40)] <- "3"
pcp_claims$Age_bins[which(pcp_claims$Age>=40& pcp_claims$Age<45)] <- "4"
pcp_claims$Age_bins[which(pcp_claims$Age>=45& pcp_claims$Age<50)] <- "5"
pcp_claims$Age_bins[which(pcp_claims$Age>=50& pcp_claims$Age<55)] <- "6"
pcp_claims$Age_bins[which(pcp_claims$Age>=55& pcp_claims$Age<60)] <- "7"
pcp_claims$Age_bins[which(pcp_claims$Age>=60& pcp_claims$Age<65)] <- "8"
pcp_claims$Age_bins[which(pcp_claims$Age>=65& pcp_claims$Age<70)] <- "9"
pcp_claims$Age_bins[which(pcp_claims$Age>=70& pcp_claims$Age<75)] <- "10"
pcp_claims$Age_bins[which(pcp_claims$Age>=75& pcp_claims$Age<80)] <- "11"
pcp_claims$Age_bins[which(pcp_claims$Age>=80& pcp_claims$Age<85)] <- "12"
pcp_claims$Age_bins[which(pcp_claims$Age>=85& pcp_claims$Age<90)] <- "13"

ethnicity <- sqlQuery(db.handle, "
SELECT DISTINCT cohort.PatientICN, Ethnicity FROM CDWWork.PatSub.PatientEthnicity AS a
INNER JOIN CDWWork.SPatient.SPatient AS b
  ON a.PatientSID=b.PatientSID
INNER JOIN OMHO_QFR.ECON.appointments AS cohort
  ON b.PatientICN=cohort.PatientICN
")

pcp_claims$RaceEthnicity <- NA
pcp_claims$RaceEthnicity[which(pcp_claims$Race %in% c("AMERICAN INDIAN OR ALASKA NATIVE"))] <- "Native"
pcp_claims$RaceEthnicity[which(pcp_claims$Race %in% c("ASIAN", "NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER"))] <- "API"
pcp_claims$RaceEthnicity[which(pcp_claims$Race %in% c("BLACK OR AFRICAN AMERICAN"))] <- "Black"
pcp_claims$RaceEthnicity[which(pcp_claims$Race %in% c("WHITE", "WHITE NOT OF HISP ORIG"))] <- "White"
pcp_claims$RaceEthnicity[which(pcp_claims$PatientICN %in% ethnicity$PatientICN[which(ethnicity$Ethnicity=="HISPANIC OR LATINO")])] <- "Hispanic"
pcp_claims$RaceEthnicity[which(is.na(pcp_claims$RaceEthnicity))] <- "Unknown"
rm(ethnicity)

# Get CMS Flag
cms <- sqlQuery(odbcDriverConnect('server=vhacdwrb02.vha.med.va.gov; database=CDWWork; driver={SQL server}; trusted_connection=true'), "
SELECT SCRSSN,
  CMS_IND04, CMS_IND05, CMS_IND06, CMS_IND07, CMS_IND08, CMS_IND09, CMS_IND10, CMS_IND11, CMS_IND12, CMS_IND13, CMS_IND14, CMS_IND15, CMS_IND16, CMS_IND17
FROM CDWWork.VitalStatus.MASTER 
WHERE CMS_IND04=2 OR CMS_IND05=2 OR CMS_IND06=2 OR CMS_IND07=2 OR CMS_IND08=2 OR CMS_IND09=2 OR CMS_IND10=2 OR CMS_IND11=2 OR CMS_IND12=2 OR CMS_IND13=2 OR CMS_IND14=2 OR CMS_IND15=2 OR CMS_IND16=2 OR CMS_IND17=2")

pcp_claims$cms_flag <- 0
pcp_claims$cms_prior <- 0
pcp_claims$VisitYear <- format(pcp_claims$VisitDate, "%Y")
# Drop before 2005
pcp_claims <- pcp_claims[which(pcp_claims$VisitYear>=2005),]
pcp_claims$cms_flag[which(pcp_claims$VisitYear=="2005" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND05==2)])] <- 1
pcp_claims$cms_flag[which(pcp_claims$VisitYear=="2006" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND06==2)])] <- 1
pcp_claims$cms_flag[which(pcp_claims$VisitYear=="2007" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND07==2)])] <- 1
pcp_claims$cms_flag[which(pcp_claims$VisitYear=="2008" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND08==2)])] <- 1
pcp_claims$cms_flag[which(pcp_claims$VisitYear=="2009" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND09==2)])] <- 1
pcp_claims$cms_flag[which(pcp_claims$VisitYear=="2010" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND10==2)])] <- 1
pcp_claims$cms_flag[which(pcp_claims$VisitYear=="2011" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND11==2)])] <- 1
pcp_claims$cms_flag[which(pcp_claims$VisitYear=="2012" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND12==2)])] <- 1
pcp_claims$cms_flag[which(pcp_claims$VisitYear=="2013" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND13==2)])] <- 1
pcp_claims$cms_flag[which(pcp_claims$VisitYear=="2014" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND14==2)])] <- 1
pcp_claims$cms_flag[which(pcp_claims$VisitYear=="2015" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND15==2)])] <- 1
pcp_claims$cms_flag[which(pcp_claims$VisitYear=="2016" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND16==2)])] <- 1
pcp_claims$cms_flag[which(pcp_claims$VisitYear=="2017" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND17==2)])] <- 1
pcp_claims$cms_prior[which(pcp_claims$VisitYear=="2005" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND04==2)])] <- 1
pcp_claims$cms_prior[which(pcp_claims$VisitYear=="2006" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND05==2)])] <- 1
pcp_claims$cms_prior[which(pcp_claims$VisitYear=="2007" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND06==2)])] <- 1
pcp_claims$cms_prior[which(pcp_claims$VisitYear=="2008" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND07==2)])] <- 1
pcp_claims$cms_prior[which(pcp_claims$VisitYear=="2009" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND08==2)])] <- 1
pcp_claims$cms_prior[which(pcp_claims$VisitYear=="2010" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND09==2)])] <- 1
pcp_claims$cms_prior[which(pcp_claims$VisitYear=="2011" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND10==2)])] <- 1
pcp_claims$cms_prior[which(pcp_claims$VisitYear=="2012" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND11==2)])] <- 1
pcp_claims$cms_prior[which(pcp_claims$VisitYear=="2013" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND12==2)])] <- 1
pcp_claims$cms_prior[which(pcp_claims$VisitYear=="2014" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND13==2)])] <- 1
pcp_claims$cms_prior[which(pcp_claims$VisitYear=="2015" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND14==2)])] <- 1
pcp_claims$cms_prior[which(pcp_claims$VisitYear=="2016" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND15==2)])] <- 1
pcp_claims$cms_prior[which(pcp_claims$VisitYear=="2017" & pcp_claims$ScrSSN %in% cms$SCRSSN[which(cms$CMS_IND16==2)])] <- 1
rm(cms)

pcp_claims$Medicare <- (pcp_claims$cms_flag==1 & pcp_claims$Age>=65)
pcp_claims$Medicaid <- (pcp_claims$cms_flag==1 & pcp_claims$Age<65)

pcp_claims$Black <- 0
pcp_claims$Black[which(pcp_claims$Race=="BLACK OR AFRICAN AMERICAN")] <- 1

pcp_claims$Female <- 1
pcp_claims$Female[which(pcp_claims$Gender=="M")] <- 0

pcp_claims$Married <- NA
pcp_claims$Married[which(pcp_claims$MaritalStatus=="MARRIED")] <- "1"
pcp_claims$Married[which(pcp_claims$MaritalStatus %in% c("DIVORCED", "SEPARATED", "WIDOW/WIDOWER", "WIDOWED"))] <- "2"
pcp_claims$Married[which(pcp_claims$MaritalStatus=="NEVER MARRIED")] <- "3"
pcp_claims$Married[which(is.na(pcp_claims$Married))] <- "4"

# Clean up income
pcp_claims$PatientIncome[which(pcp_claims$PatientIncome>=100000)] <- 100000
cpi <- data.frame(Year=as.character(2005:2019), cpi=c(64.85, 67.45, 70.44, 73.05, 75.36, 77.93, 80.30, 83.25, 85.30, 87.34, 89.64, 93.03, 95.37, 97.45, 100))
pcp_claims <- merge(x=pcp_claims, y=cpi, by.x="VisitYear", by.y="Year", all.x=T)
pcp_claims$PatientIncome <- pcp_claims$PatientIncome*100/pcp_claims$cpi


pcp_claims$PeriodOfService <- "Other"
pcp_claims$PeriodOfService[which(pcp_claims$PatientPeriodOfService=="7")] <- "Vietnam"
pcp_claims$PeriodOfService[which(pcp_claims$PatientPeriodOfService=="X")] <- "Gulf"
pcp_claims$PeriodOfService[which(pcp_claims$PatientPeriodOfService=="8")] <- "Post-Vietnam"
pcp_claims$PeriodOfService[which(pcp_claims$PatientPeriodOfService=="0")] <- "Korean"
pcp_claims$PeriodOfService[which(pcp_claims$PatientPeriodOfService=="2")] <- "WW2"
pcp_claims$PeriodOfService[which(pcp_claims$PatientPeriodOfService=="5")] <- "Post-Korean"
pcp_claims$PeriodOfService[which(is.na(pcp_claims$PatientPeriodOfService))] <- NA
pcp_claims$PeriodOfService <- as.factor(pcp_claims$PeriodOfService)
pcp_claims$PeriodOfService <- relevel(pcp_claims$PeriodOfService, "Gulf")

pcp_claims$ServiceConnectedFlag <- (pcp_claims$ServiceConnectedFlag=="Y")
pcp_claims$AnnualVACheckAmmount[which(is.na(pcp_claims$AnnualVACheckAmmount))] <- 0
pcp_claims$AnnualVACheckAmmount[which(pcp_claims$AnnualVACheckAmmount>=50000)] <- 50000
pcp_claims$AnnualVACheckAmmount <- pcp_claims$AnnualVACheckAmmount*100/pcp_claims$cpi

# CCS Single-level
ccs_icd9<- read_csv("O:/OMHSP_PERCAnalyt/Upload/icd9_ccs_category.csv")[,1:2]
ccs_icd10 <- read_csv("O:/OMHSP_PERCAnalyt/Upload/icd10_ccs_category.csv")[,1:2]
names(ccs_icd9) <- c("ICD9Code", "CCS")
names(ccs_icd10) <- c("ICD10Code", "CCS")
ccs_icd9$ICD9Code <- gsub("'", "", ccs_icd9$ICD9Code, fixed=TRUE)
ccs_icd9$ICD9Code <- gsub(" ", "", ccs_icd9$ICD9Code, fixed=TRUE)
ccs_icd9$CCS <- gsub("'", "", ccs_icd9$CCS, fixed=TRUE)
ccs_icd9$CCS <- gsub(" ", "", ccs_icd9$CCS, fixed=TRUE)
ccs_icd10$ICD10Code <- gsub("'", "", ccs_icd10$ICD10Code, fixed=TRUE)
ccs_icd10$ICD10Code <- gsub(" ", "", ccs_icd10$ICD10Code, fixed=TRUE)
ccs_icd10$CCS <- gsub("'", "", ccs_icd10$CCS, fixed=TRUE)
ccs_icd10$CCS <- gsub(" ", "", ccs_icd10$CCS, fixed=TRUE)

pcp_claims <- merge(x=pcp_claims, y=ccs_icd9, by.x="ICD9Code", by.y="ICD9Code", all.x=T)
pcp_claims <- merge(x=pcp_claims, y=ccs_icd10, by.x="ICD10Code", by.y="ICD10Code", all.x=T)
pcp_claims$CCS <- pcp_claims$CCS.x
pcp_claims$CCS[which(is.na(pcp_claims$CCS))] <- pcp_claims$CCS.x[which(is.na(pcp_claims$CCS))]
pcp_claims <- subset(pcp_claims, select=-c(CCS.x, CCS.y))
ccs_icd9<- read_csv("O:/OMHSP_PERCAnalyt/Upload/icd9_ccs_category.csv")[,2:3]
names(ccs_icd9) <- c("CCS", "CCS_Description")
ccs_icd9$CCS <- gsub("'", "", ccs_icd9$CCS, fixed=TRUE)
ccs_icd9$CCS <- gsub(" ", "", ccs_icd9$CCS, fixed=TRUE)
ccs_icd9$CCS_Description <- gsub("'", "", ccs_icd9$CCS_Description, fixed=TRUE)
ccs_icd9 <- ccs_icd9[-1,]
ccs_icd9 <- ccs_icd9[!duplicated(ccs_icd9),]
pcp_claims <- merge(x=pcp_claims, y=ccs_icd9, by.x="CCS", by.y="CCS")

# CCS Multi-level 1 (MDC)
ccs_icd10 <- read_csv("O:/OMHSP_PERCAnalyt/Upload/icd10_ccs_category.csv")[,c(2,6)]
names(ccs_icd10) <- c("CCS", "MDC")
ccs_icd10$CCS <- gsub("'", "", ccs_icd10$CCS, fixed=TRUE)
ccs_icd10$CCS <- gsub(" ", "", ccs_icd10$CCS, fixed=TRUE)
ccs_icd10 <- ccs_icd10[!duplicated(ccs_icd10),]
pcp_claims <- merge(x=pcp_claims, y=ccs_icd10, by.x="CCS", by.y="CCS")
pcp_claims <- pcp_claims[!MDC %in% c("Certain conditions originating in the perinatal period")]
pcp_claims$MDC <- factor(pcp_claims$MDC)
pcp_claims$MDC <- relevel(pcp_claims$MDC, "Symptoms; signs; and ill-defined conditions and factors influencing health status")

rm(ccs_icd9, ccs_icd10)

# Only keep visits that occur between 8-5 (99% of claims)
pcp_claims$Hour <- as.numeric(substring(pcp_claims$VisitDateTime, 12, 13)) + as.numeric(substring(pcp_claims$VisitDateTime, 15, 16))/60
pcp_claims <- data.table(pcp_claims)
pcp_claims <- pcp_claims[Hour>=7 & Hour<=18]
pcp_claims$Hour <- round(pcp_claims$Hour)

# Remove Saturday and Sunday
pcp_claims <- pcp_claims[!DayOfWeek=="Sunday"]


# Merge in date of death
vitalstatus <- sqlQuery(odbcDriverConnect('server=vhacdwrb02.vha.med.va.gov; database=CDWWork; driver={SQL server}; trusted_connection=true'),
                        "SELECT SCRSSN, DOB, DOD FROM CDWWork.VitalStatus.Mini WHERE DOD_IND='1'")

pcp_claims <- merge(x=pcp_claims, y=vitalstatus[c("SCRSSN", "DOD")], by.x="ScrSSN", by.y="SCRSSN", all.x=T)
pcp_claims$DOD <- as.Date(substring(pcp_claims$DOD, 1, 10))
pcp_claims <- pcp_claims[DOD>VisitDate | is.na(DOD)] # omit those that die the day of or before PCP visit
pcp_claims$DeathRelativeMonth <- as.numeric(ceiling((pcp_claims$DOD-pcp_claims$VisitDate)/30))
pcp_claims$DeathRelativeMonth[which(is.na(pcp_claims$DeathRelativeMonth))] <- Inf
rm(vitalstatus)

pcp_claims$Death1Year <- (pcp_claims$DeathRelativeMonth<=12)
pcp_claims$Death2Year <- (pcp_claims$DeathRelativeMonth<=24)
pcp_claims$Death3Year <- (pcp_claims$DeathRelativeMonth<=36)

# Stop codes:
pcp_claims$StopCode <- paste(pcp_claims$StopCode1, pcp_claims$StopCode2, sep="")
pcp_claims <- pcp_claims[StopCode1 %in% c("323", "322", "301", "348") & !StopCode2 %in% c("125", "130", "131")]

pcp_claims$PeriodOfService[which(pcp_claims$PeriodOfService==""|is.na(pcp_claims$PeriodOfService))] <- "Other"
pcp_claims$EnrollmentPriority <- as.character(pcp_claims$EnrollmentPriority)
pcp_claims$EnrollmentPriority[which(is.na(pcp_claims$EnrollmentPriority))] <- "Other"
pcp_claims$ExposedToAgentOrangeFlag[which(is.na(pcp_claims$ExposedToAgentOrangeFlag))] <- 0

# Only keep large facilities with at least 2 PCP per year
pcp_claims <- pcp_claims[, `:=` (N=length(unique(PatientICN))), by="InstitutionSID"]
pcp_claims <- pcp_claims[InstitutionSID>0 & N>200]
pcp_claims <- pcp_claims[, `:=` (N_Patients=length(unique(PatientICN))), by="StaffSSN"]

pcp_claims$facility_year <- interaction(pcp_claims$InstitutionSID, pcp_claims$VisitYear)
pcp_claims <- pcp_claims[, `:=` (N=length(unique(StaffSSN))), by="facility_year"]
pcp_claims <- pcp_claims[N>=2]

# Replace missing outcomes with zero since dummies
pcp_claims <- data.frame(pcp_claims)
replace_index <- which(names(pcp_claims) %in% c("MH_dummy_HOSP_lag1", "MH_count_HOSP_lag1", "MH_dummy_HOSP_cum3", "MH_count_HOSP_cum3", "PH_dummy_HOSP_lag1", "PH_count_HOSP_lag1", "PH_dummy_HOSP_cum3", "PH_count_HOSP_cum3", 
                                                "MH_dummy_ED_lag1", "MH_count_ED_lag1", "MH_dummy_ED_cum3", "MH_count_ED_cum3", "PH_dummy_ED_lag1", "PH_count_ED_lag1", "PH_dummy_ED_cum3", "PH_count_ED_cum3", 
                                                "MH_dummy_UCC_lag1", "MH_count_UCC_lag1", "MH_dummy_UCC_cum3", "MH_count_UCC_cum3", "PH_dummy_UCC_lag1", "PH_count_UCC_lag1", "PH_dummy_UCC_cum3", "PH_count_UCC_cum3", 
                                                "MH_dummy_COMMHOSP_lag1", "MH_count_COMMHOSP_lag1", "MH_dummy_COMMHOSP_cum3", "MH_count_COMMHOSP_cum3", "PH_dummy_COMMHOSP_lag1", "PH_count_COMMHOSP_lag1", "PH_dummy_COMMHOSP_cum3", "PH_count_COMMHOSP_cum3", 
                                                "MH_dummy_COMMED_lag1", "MH_count_COMMED_lag1", "MH_dummy_COMMED_cum3", "MH_count_COMMED_cum3", "PH_dummy_COMMED_lag1", "PH_count_COMMED_lag1", "PH_dummy_COMMED_cum3", "PH_count_COMMED_cum3", 
                                                "MH_dummy_COMMUCC_lag1", "MH_count_COMMUCC_lag1", "MH_dummy_COMMUCC_cum3", "MH_count_COMMUCC_cum3", "PH_dummy_COMMUCC_lag1", "PH_count_COMMUCC_lag1", "PH_dummy_COMMUCC_cum3", "PH_count_COMMUCC_cum3", 
                                                "MH_dummy_CMSHOSP_lag1", "MH_count_CMSHOSP_lag1", "MH_dummy_CMSHOSP_cum3", "MH_count_CMSHOSP_cum3", "PH_dummy_CMSHOSP_lag1", "PH_count_CMSHOSP_lag1", "PH_dummy_CMSHOSP_cum3", "PH_count_CMSHOSP_cum3",
                                                "MH_dummy_CMSED_lag1", "MH_count_CMSED_lag1", "MH_dummy_CMSED_cum3", "MH_count_CMSED_cum3", "PH_dummy_CMSED_lag1", "PH_count_CMSED_lag1", "PH_dummy_CMSED_cum3", "PH_count_CMSED_cum3", 
                                                "MH_dummy_CMSUCC_lag1", "MH_count_CMSUCC_lag1", "MH_dummy_CMSUCC_cum3", "MH_count_CMSUCC_cum3", "PH_dummy_CMSUCC_lag1", "PH_count_CMSUCC_lag1", "PH_dummy_CMSUCC_cum3", "PH_count_CMSUCC_cum3"))

pcp_claims[replace_index][is.na(pcp_claims[replace_index])] <- 0
rm(replace_index)
pcp_claims <- data.table(pcp_claims)


# Top-code MH_count and PH_count at 10 per year:
pcp_claims <- data.frame(pcp_claims)
replace_index_lag1 <- which(names(pcp_claims) %in% c("MH_dummy_HOSP_lag1", "MH_count_HOSP_lag1", "PH_dummy_HOSP_lag1", "PH_count_HOSP_lag1", 
                                                     "MH_dummy_ED_lag1", "MH_count_ED_lag1", "PH_dummy_ED_lag1", "PH_count_ED_lag1", 
                                                     "MH_dummy_UCC_lag1", "MH_count_UCC_lag1", "PH_dummy_UCC_lag1", "PH_count_UCC_lag1",
                                                     "MH_dummy_COMMHOSP_lag1", "MH_count_COMMHOSP_lag1", "PH_dummy_COMMHOSP_lag1", "PH_count_COMMHOSP_lag1", 
                                                     "MH_dummy_COMMED_lag1", "MH_count_COMMED_lag1", "PH_dummy_COMMED_lag1", "PH_count_COMMED_lag1", 
                                                     "MH_dummy_COMMUCC_lag1", "MH_count_COMMUCC_lag1", "PH_dummy_COMMUCC_lag1", "PH_count_COMMUCC_lag1", 
                                                     "MH_dummy_CMSHOSP_lag1", "MH_count_CMSHOSP_lag1", "PH_dummy_CMSHOSP_lag1", "PH_count_CMSHOSP_lag1",
                                                     "MH_dummy_CMSED_lag1", "MH_count_CMSED_lag1", "PH_dummy_CMSED_lag1", "PH_count_CMSED_lag1",
                                                     "MH_dummy_CMSUCC_lag1", "MH_count_CMSUCC_lag1", "PH_dummy_CMSUCC_lag1", "PH_count_CMSUCC_lag1"))
pcp_claims[replace_index_lag1][(pcp_claims[replace_index_lag1]>=10)] <- 10
replace_index_cum3 <- which(names(pcp_claims) %in% c("MH_dummy_HOSP_cum3", "MH_count_HOSP_cum3", "PH_dummy_HOSP_cum3", "PH_count_HOSP_cum3", 
                                                     "MH_dummy_ED_cum3", "MH_count_ED_cum3", "PH_dummy_ED_cum3", "PH_count_ED_cum3", 
                                                     "MH_dummy_UCC_cum3", "MH_count_UCC_cum3", "PH_dummy_UCC_cum3", "PH_count_UCC_cum3",
                                                     "MH_dummy_COMMHOSP_cum3", "MH_count_COMMHOSP_cum3", "PH_dummy_COMMHOSP_cum3", "PH_count_COMMHOSP_cum3", 
                                                     "MH_dummy_COMMED_cum3", "MH_count_COMMED_cum3", "PH_dummy_COMMED_cum3", "PH_count_COMMED_cum3", 
                                                     "MH_dummy_COMMUCC_cum3", "MH_count_COMMUCC_cum3", "PH_dummy_COMMUCC_cum3", "PH_count_COMMUCC_cum3", 
                                                     "MH_dummy_CMSHOSP_cum3", "MH_count_CMSHOSP_cum3", "PH_dummy_CMSHOSP_cum3", "PH_count_CMSHOSP_cum3",
                                                     "MH_dummy_CMSED_cum3", "MH_count_CMSED_cum3", "PH_dummy_CMSED_cum3", "PH_count_CMSED_cum3",
                                                     "MH_dummy_CMSUCC_cum3", "MH_count_CMSUCC_cum3", "PH_dummy_CMSUCC_cum3", "PH_count_CMSUCC_cum3"))
pcp_claims[replace_index_cum3][(pcp_claims[replace_index_cum3]>=30)] <- 30
rm(replace_index_lag1, replace_index_cum3)


pcp_claims$TimeOut[which(pcp_claims$TimeOut<0)] <- NA
pcp_claims$TimeOut[which(pcp_claims$TimeOut>=90)] <- 90
pcp_claims$TimeOut_bins <- NA
pcp_claims$TimeOut_bins[which(pcp_claims$TimeOut==0)] <- "1"
pcp_claims$TimeOut_bins[which(pcp_claims$TimeOut>=1 & pcp_claims$TimeOut<=7)] <- "2"
pcp_claims$TimeOut_bins[which(pcp_claims$TimeOut>=8 & pcp_claims$TimeOut<=14)] <- "3"
pcp_claims$TimeOut_bins[which(pcp_claims$TimeOut>=15 & pcp_claims$TimeOut<=21)] <- "4"
pcp_claims$TimeOut_bins[which(pcp_claims$TimeOut>=22 & pcp_claims$TimeOut<=30)] <- "5"
pcp_claims$TimeOut_bins[which(pcp_claims$TimeOut>30 & pcp_claims$TimeOut<=60)] <- "6"
pcp_claims$TimeOut_bins[which(pcp_claims$TimeOut>60)] <- "7"

# V1: VA Hosp and ED ONLY
pcp_claims$MH_dummy_lag1_v1 <- pmax(pcp_claims$MH_dummy_HOSP_lag1, pcp_claims$MH_dummy_ED_lag1)
pcp_claims$PH_dummy_lag1_v1 <- pmax(pcp_claims$PH_dummy_HOSP_lag1, pcp_claims$PH_dummy_ED_lag1)
pcp_claims$MH_dummy_cum3_v1 <- pmax(pcp_claims$MH_dummy_HOSP_cum3, pcp_claims$MH_dummy_ED_cum3)
pcp_claims$PH_dummy_cum3_v1 <- pmax(pcp_claims$PH_dummy_HOSP_cum3, pcp_claims$PH_dummy_ED_cum3)

pcp_claims$MH_count_lag1_v1 <- pcp_claims$MH_count_HOSP_lag1 + pcp_claims$MH_count_ED_lag1
pcp_claims$PH_count_lag1_v1 <- pcp_claims$PH_count_HOSP_lag1 + pcp_claims$PH_count_ED_lag1
pcp_claims$MH_count_cum3_v1 <- pcp_claims$MH_count_HOSP_cum3 + pcp_claims$MH_count_ED_cum3
pcp_claims$PH_count_cum3_v1 <- pcp_claims$PH_count_HOSP_cum3 + pcp_claims$PH_count_ED_cum3

# V3: VA, PIT, and CMS HOSP and ED (no UCC)
pcp_claims$MH_dummy_lag1_v3 <- pmax(pcp_claims$MH_dummy_HOSP_lag1, pcp_claims$MH_dummy_ED_lag1, pcp_claims$MH_dummy_COMMHOSP_lag1, pcp_claims$MH_dummy_COMMED_lag1, pcp_claims$MH_dummy_CMSHOSP_lag1, pcp_claims$MH_dummy_CMSED_lag1)
pcp_claims$PH_dummy_lag1_v3 <- pmax(pcp_claims$PH_dummy_HOSP_lag1, pcp_claims$PH_dummy_ED_lag1, pcp_claims$PH_dummy_COMMHOSP_lag1, pcp_claims$PH_dummy_COMMED_lag1, pcp_claims$PH_dummy_CMSHOSP_lag1, pcp_claims$PH_dummy_CMSED_lag1)
pcp_claims$MH_dummy_cum3_v3 <- pmax(pcp_claims$MH_dummy_HOSP_cum3, pcp_claims$MH_dummy_ED_cum3, pcp_claims$MH_dummy_COMMHOSP_cum3, pcp_claims$MH_dummy_COMMED_cum3, pcp_claims$MH_dummy_CMSHOSP_cum3, pcp_claims$MH_dummy_CMSED_cum3)
pcp_claims$PH_dummy_cum3_v3 <- pmax(pcp_claims$PH_dummy_HOSP_cum3, pcp_claims$PH_dummy_ED_cum3, pcp_claims$PH_dummy_COMMHOSP_cum3, pcp_claims$PH_dummy_COMMED_cum3, pcp_claims$PH_dummy_CMSHOSP_cum3, pcp_claims$PH_dummy_CMSED_cum3)

pcp_claims$MH_count_lag1_v3 <- pcp_claims$MH_count_HOSP_lag1 + pcp_claims$MH_count_ED_lag1 + pcp_claims$MH_count_COMMHOSP_lag1 + pcp_claims$MH_count_COMMED_lag1 + pcp_claims$MH_count_CMSHOSP_lag1 + pcp_claims$MH_count_CMSED_lag1
pcp_claims$PH_count_lag1_v3 <- pcp_claims$PH_count_HOSP_lag1 + pcp_claims$PH_count_ED_lag1 + pcp_claims$PH_count_COMMHOSP_lag1 + pcp_claims$PH_count_COMMED_lag1 + pcp_claims$PH_count_CMSHOSP_lag1 + pcp_claims$PH_count_CMSED_lag1
pcp_claims$MH_count_cum3_v3 <- pcp_claims$MH_count_HOSP_cum3 + pcp_claims$MH_count_ED_cum3 + pcp_claims$MH_count_COMMHOSP_cum3 + pcp_claims$MH_count_COMMED_cum3 + pcp_claims$MH_count_CMSHOSP_cum3 + pcp_claims$MH_count_CMSED_cum3
pcp_claims$PH_count_cum3_v3 <- pcp_claims$PH_count_HOSP_cum3 + pcp_claims$PH_count_ED_cum3 + pcp_claims$PH_count_COMMHOSP_cum3 + pcp_claims$PH_count_COMMED_cum3 + pcp_claims$PH_count_CMSHOSP_cum3 + pcp_claims$PH_count_CMSED_cum3

pcp_claims <- data.frame(pcp_claims)
replace_index_lag1 <- which(names(pcp_claims) %in% c("MH_count_lag1_v1", "PH_count_lag1_v1", "MH_count_lag1_v3", "PH_count_lag1_v3"))
pcp_claims[replace_index_lag1][(pcp_claims[replace_index_lag1]>=10)] <- 10
replace_index_cum3 <- which(names(pcp_claims) %in% c("MH_count_cum3_v1", "PH_count_cum3_v1", "MH_count_cum3_v3", "PH_count_cum3_v3"))
pcp_claims[replace_index_cum3][(pcp_claims[replace_index_cum3]>=30)] <- 30
rm(replace_index_lag1, replace_index_cum3)
pcp_claims <- data.table(pcp_claims)


# Remove patients who have seen their PCP prior to request date:
pcp_claims$StaffSSN <- as.character(pcp_claims$StaffSSN)
pcp_claims$PV_StaffSSN <- as.character(pcp_claims$PV_StaffSSN)
pcp_claims <- pcp_claims[!PV_StaffSSN==StaffSSN | is.na(PV_StaffSSN)]

pcp_claims <- pcp_claims[!is.na(DesiredDayOfWeek) & !is.na(TimeOut_bins)]

pcp_claims <- unique(pcp_claims, by="PatientICN")

####################################################################################
#####################      Merge in HERC Cost and NOSOS risk data:
####################################################################################
# Quarter is null to be consistent because 2014 and prior only used quarter=null
# query takes 30mins
pcp_claims$VisitDate <- as.Date(pcp_claims$VisitDate)
pcp_claims$VisitFY <- NA
pcp_claims$VisitFY[which(pcp_claims$VisitDate>=as.Date("2004-10-01") & pcp_claims$VisitDate<as.Date("2005-10-01"))] <- 2005
pcp_claims$VisitFY[which(pcp_claims$VisitDate>=as.Date("2005-10-01") & pcp_claims$VisitDate<as.Date("2006-10-01"))] <- 2006
pcp_claims$VisitFY[which(pcp_claims$VisitDate>=as.Date("2006-10-01") & pcp_claims$VisitDate<as.Date("2007-10-01"))] <- 2007
pcp_claims$VisitFY[which(pcp_claims$VisitDate>=as.Date("2007-10-01") & pcp_claims$VisitDate<as.Date("2008-10-01"))] <- 2008
pcp_claims$VisitFY[which(pcp_claims$VisitDate>=as.Date("2008-10-01") & pcp_claims$VisitDate<as.Date("2009-10-01"))] <- 2009
pcp_claims$VisitFY[which(pcp_claims$VisitDate>=as.Date("2009-10-01") & pcp_claims$VisitDate<as.Date("2010-10-01"))] <- 2010
pcp_claims$VisitFY[which(pcp_claims$VisitDate>=as.Date("2010-10-01") & pcp_claims$VisitDate<as.Date("2011-10-01"))] <- 2011
pcp_claims$VisitFY[which(pcp_claims$VisitDate>=as.Date("2011-10-01") & pcp_claims$VisitDate<as.Date("2012-10-01"))] <- 2012
pcp_claims$VisitFY[which(pcp_claims$VisitDate>=as.Date("2012-10-01") & pcp_claims$VisitDate<as.Date("2013-10-01"))] <- 2013
pcp_claims$VisitFY[which(pcp_claims$VisitDate>=as.Date("2013-10-01") & pcp_claims$VisitDate<as.Date("2014-10-01"))] <- 2014
pcp_claims$VisitFY[which(pcp_claims$VisitDate>=as.Date("2014-10-01") & pcp_claims$VisitDate<as.Date("2015-10-01"))] <- 2015
pcp_claims$VisitFY[which(pcp_claims$VisitDate>=as.Date("2015-10-01") & pcp_claims$VisitDate<as.Date("2016-10-01"))] <- 2016


avgcost_out <- sqlQuery(odbcDriverConnect('server=vhacdwrb02.vha.med.va.gov; database=CDWWork; driver={SQL server}; trusted_connection=true'),
                        "SELECT PatientICN, year, SUM(costl) AS out_cost FROM VINCI_HERC.HERC.OPCSE WHERE year>=2006 GROUP BY PatientICN, year")
avgcost_out <- avgcost_out[which(avgcost_out$PatientICN %in% pcp_claims$PatientICN),]

avgcost_inp <- sqlQuery(odbcDriverConnect('server=vhacdwrb02.vha.med.va.gov; database=CDWWork; driver={SQL server}; trusted_connection=true'),
                        "SELECT PatientICN, year, SUM(costl) AS inp_cost FROM VINCI_HERC.HERC.DISCHG WHERE year>=2006 GROUP BY PatientICN, year")
avgcost_inp <- avgcost_inp[which(avgcost_inp$PatientICN %in% pcp_claims$PatientICN),]

herc <- merge(x=avgcost_out, y=avgcost_inp, by.x=c("PatientICN", "year"), by.y=c("PatientICN", "year"), all.x=T, all.y=T)
herc$out_cost[which(is.na(herc$out_cost))] <- 0
herc$inp_cost[which(is.na(herc$inp_cost))] <- 0
herc$avgcost <- herc$out_cost+herc$inp_cost
herc <- merge(x=herc, y=pcp_claims[,c("PatientICN", "VisitFY")], by.x="PatientICN", by.y="PatientICN", all.x=T)

herc_yr1 <- herc[which(herc$year==(herc$VisitFY+1)),]
herc_yr2 <- herc[which(herc$year==(herc$VisitFY+2)),]
herc_yr3 <- herc[which(herc$year==(herc$VisitFY+3)),]

herc_yr1$AvgCost_yr1 <- herc_yr1$avgcost
herc_yr2$AvgCost_yr2 <- herc_yr2$avgcost
herc_yr3$AvgCost_yr3 <- herc_yr3$avgcost

pcp_claims <- merge(x=pcp_claims, y=herc_yr1[,c("PatientICN", "AvgCost_yr1")], by.x="PatientICN", by.y="PatientICN", all.x=T)
pcp_claims <- merge(x=pcp_claims, y=herc_yr2[,c("PatientICN", "AvgCost_yr2")], by.x="PatientICN", by.y="PatientICN", all.x=T)
pcp_claims <- merge(x=pcp_claims, y=herc_yr3[,c("PatientICN", "AvgCost_yr3")], by.x="PatientICN", by.y="PatientICN", all.x=T)
pcp_claims$AvgCost_yr1[which(is.na(pcp_claims$AvgCost_yr1))] <- 0
pcp_claims$AvgCost_yr2[which(is.na(pcp_claims$AvgCost_yr2))] <- 0
pcp_claims$AvgCost_yr3[which(is.na(pcp_claims$AvgCost_yr3))] <- 0

pcp_claims$herc_yr1 <- pcp_claims$AvgCost_yr1
pcp_claims$herc_cum2 <- pcp_claims$AvgCost_yr1+pcp_claims$AvgCost_yr2
pcp_claims$herc_cum3 <- pcp_claims$AvgCost_yr1+pcp_claims$AvgCost_yr2+pcp_claims$AvgCost_yr3
rm(herc, herc_yr1, herc_yr2, herc_yr3, avgcost_inp, avgcost_out)


###########################################
##########         ACSC          ##########
###########################################

ACSC <- sqlQuery(db.handle, "set nocount on; set ansi_warnings off; SELECT * FROM OMHO_QFR.ECON.appointments_acsc")

ACSC$ACSCDate <- as.Date(ACSC$ACSCDate, "%Y-%m-%d")
ACSC <- merge(x=ACSC, y=pcp_claims[,c("PatientICN", "AppointmentRequestDate")], by.x="PatientICN", by.y="PatientICN")
ACSC$Post <- (ACSC$ACSCDate>ACSC$AppointmentRequestDate)

pcp_claims$ACSC_cum3 <- (pcp_claims$PatientICN %in% ACSC$PatientICN[which(ACSC$Post==1)])
pcp_claims$ACSC_lag1 <- (pcp_claims$PatientICN %in% ACSC$PatientICN[which(ACSC$Post==0)])


##############################################################
##########    Create Value Add (valueAdd table)     ##########
##############################################################

pcp_claims <- pcp_claims[!StaffSSN %in% c("", "*Missing*") & !is.na(StaffSSN)]

pcp_claims <- pcp_claims[, `:=` (TotalCaseCount=length(PatientICN)), by="StaffSSN"]
pcp_claims <- pcp_claims[TotalCaseCount>=20]

pcp_claims$RadiationExposureIndicatedFlag[which(is.na(pcp_claims$RadiationExposureIndicatedFlag))] <- 0
pcp_claims$AnnualVACheckAmount[which(is.na(pcp_claims$AnnualVACheckAmount))] <- 0
pcp_claims$UnemployableFlag[which(is.na(pcp_claims$UnemployableFlag))] <- 0

pcp_claims$residual_MH <- resid(felm(MH_dummy_cum3_v1~Black+Medicare+Medicaid+PH_dummy_lag1_v1+MH_dummy_lag1_v1+ACSC_lag1|Age_bins+Married+EnrollmentPriority+PeriodOfService+ServiceConnectedFlag+UnemployableFlag+ExposedToAgentOrangeFlag+RadiationExposureIndicatedFlag+AnnualVACheckAmount+InstitutionSID+YearXMonth+DayOfWeek+TimeOut_bins, data=pcp_claims))
pcp_claims$residual_PH <- resid(felm(PH_dummy_cum3_v1~Black+Medicare+Medicaid+PH_dummy_lag1_v1+MH_dummy_lag1_v1+ACSC_lag1|Age_bins+Married+EnrollmentPriority+PeriodOfService+ServiceConnectedFlag+UnemployableFlag+ExposedToAgentOrangeFlag+RadiationExposureIndicatedFlag+AnnualVACheckAmount+InstitutionSID+YearXMonth+DayOfWeek+TimeOut_bins, data=pcp_claims))
pcp_claims$residual_ACSC <- resid(felm(ACSC_cum3~Black+Medicare+Medicaid+PH_dummy_lag1_v1+MH_dummy_lag1_v1+ACSC_lag1|Age_bins+Married+EnrollmentPriority+PeriodOfService+ServiceConnectedFlag+UnemployableFlag+ExposedToAgentOrangeFlag+RadiationExposureIndicatedFlag+AnnualVACheckAmount+InstitutionSID+YearXMonth+DayOfWeek+TimeOut_bins, data=pcp_claims))

pcp_claims$residual_MH_bare <- resid(felm(MH_dummy_cum3_v1~0|InstitutionSID+YearXMonth+DayOfWeek+TimeOut_bins, data=pcp_claims))
pcp_claims$residual_PH_bare <- resid(felm(PH_dummy_cum3_v1~0|InstitutionSID+YearXMonth+DayOfWeek+TimeOut_bins, data=pcp_claims))
pcp_claims$residual_ACSC_bare <- resid(felm(ACSC_cum3~0|InstitutionSID+YearXMonth+DayOfWeek+TimeOut_bins, data=pcp_claims))

drift <- pcp_claims[, `:=` (Sum_Resid_MH_bare=sum(residual_MH_bare, na.rm=T), Sum_Resid_MH=sum(residual_MH, na.rm=T), 
                            Sum_Resid_PH_bare=sum(residual_PH_bare, na.rm=T), Sum_Resid_PH=sum(residual_PH, na.rm=T),
                            Sum_Resid_ACSC_bare=sum(residual_ACSC_bare, na.rm=T), Sum_Resid_ACSC=sum(residual_ACSC, na.rm=T),
                            AnnualCaseCount=length(PatientICN)), by=c("StaffSSN", "AssignYear")]

drift$Z_MH_bare_year <- (drift$Sum_Resid_MH_bare - drift$residual_MH_bare)/(drift$AnnualCaseCount - 1)
drift$Z_MH_year <- (drift$Sum_Resid_MH - drift$residual_MH)/(drift$AnnualCaseCount - 1)
drift$Z_PH_bare_year <- (drift$Sum_Resid_PH_bare - drift$residual_PH_bare)/(drift$AnnualCaseCount - 1)
drift$Z_PH_year <- (drift$Sum_Resid_PH - drift$residual_PH)/(drift$AnnualCaseCount - 1)
drift$Z_ACSC_bare_year <- (drift$Sum_Resid_ACSC_bare - drift$residual_ACSC_bare)/(drift$AnnualCaseCount - 1)
drift$Z_ACSC_year <- (drift$Sum_Resid_ACSC - drift$residual_ACSC)/(drift$AnnualCaseCount - 1)

drift <- drift[,c("PatientICN", "MH_dummy_cum3_v1", "PH_dummy_cum3_v1", "ACSC_cum3", "Z_MH_year", "Z_PH_year", "Z_ACSC_year", "Z_MH_bare_year", "Z_PH_bare_year", "Z_ACSC_bare_year", "StaffSSN", "AnnualCaseCount", "AssignYear")]

drift$N_bin <- "1"
drift$N_bin[which(drift$AnnualCaseCount>=10 & drift$AnnualCaseCount<25)] <- "2"
drift$N_bin[which(drift$AnnualCaseCount>=25 & drift$AnnualCaseCount<50)] <- "3"
drift$N_bin[which(drift$AnnualCaseCount>=50)] <- "4"

drift_wide <- reshape(drift, v.names=c("Z_MH_year", "Z_PH_year", "Z_ACSC_year", "Z_MH_bare_year", "Z_PH_bare_year", "Z_ACSC_bare_year", "N_bin"), idvar="StaffSSN", timevar="AssignYear", direct="wide")
drift_wide <- data.frame(drift_wide)
drift_wide[is.na(drift_wide)] <- 0
drift_wide <- data.table(drift_wide)

drift <- merge(x=drift, y=drift_wide[,-c("PatientICN", "MH_dummy_cum3_v1", "PH_dummy_cum3_v1", "ACSC_cum3")], by.x="StaffSSN", by.y="StaffSSN", all.x=T)

# Get 
drift$FE_MH_bare <- lm(MH_dummy_cum3_v1~N_bin.2005*Z_MH_bare_year.2005+N_bin.2006*Z_MH_bare_year.2006+N_bin.2007*Z_MH_bare_year.2007+N_bin.2008*Z_MH_bare_year.2008+N_bin.2009*Z_MH_bare_year.2009+N_bin.2010*Z_MH_bare_year.2010+N_bin.2011*Z_MH_bare_year.2011+N_bin.2012*Z_MH_bare_year.2012+N_bin.2013*Z_MH_bare_year.2013+N_bin.2014*Z_MH_bare_year.2014+N_bin.2015*Z_MH_bare_year.2015+N_bin.2016*Z_MH_bare_year.2016+N_bin.2017*Z_MH_bare_year.2017, data=drift)$fitted.values
drift$FE_MH <- lm(MH_dummy_cum3_v1~N_bin.2005*Z_MH_year.2005+N_bin.2006*Z_MH_year.2006+N_bin.2007*Z_MH_year.2007+N_bin.2008*Z_MH_year.2008+N_bin.2009*Z_MH_year.2009+N_bin.2010*Z_MH_year.2010+N_bin.2011*Z_MH_year.2011+N_bin.2012*Z_MH_year.2012+N_bin.2013*Z_MH_year.2013+N_bin.2014*Z_MH_year.2014+N_bin.2015*Z_MH_year.2015+N_bin.2016*Z_MH_year.2016+N_bin.2017*Z_MH_year.2017, data=drift)$fitted.values
drift$FE_PH_bare <- lm(PH_dummy_cum3_v1~N_bin.2005*Z_PH_bare_year.2005+N_bin.2006*Z_PH_bare_year.2006+N_bin.2007*Z_PH_bare_year.2007+N_bin.2008*Z_PH_bare_year.2008+N_bin.2009*Z_PH_bare_year.2009+N_bin.2010*Z_PH_bare_year.2010+N_bin.2011*Z_PH_bare_year.2011+N_bin.2012*Z_PH_bare_year.2012+N_bin.2013*Z_PH_bare_year.2013+N_bin.2014*Z_PH_bare_year.2014+N_bin.2015*Z_PH_bare_year.2015+N_bin.2016*Z_PH_bare_year.2016+N_bin.2017*Z_PH_bare_year.2017, data=drift)$fitted.values
drift$FE_PH <- lm(PH_dummy_cum3_v1~N_bin.2005*Z_PH_year.2005+N_bin.2006*Z_PH_year.2006+N_bin.2007*Z_PH_year.2007+N_bin.2008*Z_PH_year.2008+N_bin.2009*Z_PH_year.2009+N_bin.2010*Z_PH_year.2010+N_bin.2011*Z_PH_year.2011+N_bin.2012*Z_PH_year.2012+N_bin.2013*Z_PH_year.2013+N_bin.2014*Z_PH_year.2014+N_bin.2015*Z_PH_year.2015+N_bin.2016*Z_PH_year.2016+N_bin.2017*Z_PH_year.2017, data=drift)$fitted.values
drift$FE_ACSC_bare <- lm(ACSC_cum3~N_bin.2005*Z_ACSC_bare_year.2005+N_bin.2006*Z_ACSC_bare_year.2006+N_bin.2007*Z_ACSC_bare_year.2007+N_bin.2008*Z_ACSC_bare_year.2008+N_bin.2009*Z_ACSC_bare_year.2009+N_bin.2010*Z_ACSC_bare_year.2010+N_bin.2011*Z_ACSC_bare_year.2011+N_bin.2012*Z_ACSC_bare_year.2012+N_bin.2013*Z_ACSC_bare_year.2013+N_bin.2014*Z_ACSC_bare_year.2014+N_bin.2015*Z_ACSC_bare_year.2015+N_bin.2016*Z_ACSC_bare_year.2016+N_bin.2017*Z_ACSC_bare_year.2017, data=drift)$fitted.values
drift$FE_ACSC <- lm(ACSC_cum3~N_bin.2005*Z_ACSC_year.2005+N_bin.2006*Z_ACSC_year.2006+N_bin.2007*Z_ACSC_year.2007+N_bin.2008*Z_ACSC_year.2008+N_bin.2009*Z_ACSC_year.2009+N_bin.2010*Z_ACSC_year.2010+N_bin.2011*Z_ACSC_year.2011+N_bin.2012*Z_ACSC_year.2012+N_bin.2013*Z_ACSC_year.2013+N_bin.2014*Z_ACSC_year.2014+N_bin.2015*Z_ACSC_year.2015+N_bin.2016*Z_ACSC_year.2016+N_bin.2017*Z_ACSC_year.2017, data=drift)$fitted.values

pcp_claims <- merge(x=pcp_claims, y=drift[,c("PatientICN", "FE_MH_bare", "FE_MH", "FE_PH_bare", "FE_PH", "FE_ACSC_bare", "FE_ACSC")], by.x="PatientICN", by.y="PatientICN", all.x=T)
pcp_claims$FE_MH_std <- -scale(pcp_claims$FE_MH)
pcp_claims$FE_MH_bare_std <- -scale(pcp_claims$FE_MH_bare)
pcp_claims$FE_PH_std <- -scale(pcp_claims$FE_PH)
pcp_claims$FE_PH_bare_std <- -scale(pcp_claims$FE_PH_bare)
pcp_claims$FE_ACSC_std <- -scale(pcp_claims$FE_ACSC)
pcp_claims$FE_ACSC_bare_std <- -scale(pcp_claims$FE_ACSC_bare)

fwrite(pcp_claims, "~/pcp_claims.csv")
